###source("/home/dalton_m/ppp/pyfiles/ANldb_pppv6.R")
#### nohup R CMD BATCH /home/dalton_m/ppp/pyfiles/ANldb_pppv6.R | tee &

tdate <- format(Sys.time(), "%Y%m%d")
resultsloc <- paste0("/dataERS/eract/daltonm/results/ppp/",tdate)
if (!dir.exists(resultsloc)) {dir.create(resultsloc)}

resultsloc <- paste0(resultsloc,"/")
dataloc <- "/dataERS/daltonm/pppfiles/"


library('fastDummies')
library(dplyr)

filename = 'ANldb_pppv7'
tfname = paste0(filename,'.txt')

sink(paste0(resultsloc,tfname), append=TRUE)


dataname <- "ANppp_ldbv1.csv"
#dataname <- "ANnberv5_small.csv"
# # Importing the dataset
dataset = read.csv(paste0(dataloc,dataname))
#dataset <- dataset[sample(1:nrow(dataset), 100000), ]
library(sqldf)
dataset <- read.csv.sql(paste0(dataloc,dataname), "select * from file where D10pctsample = 1")
dataset <- dataset[(dataset['emp_max_19'] >= 2),]
#dataset <- dataset[(dataset['cutoff_score'] <= 1.5),]
#dataset <- dataset[sample(1:nrow(dataset), 100000), ]

dataset <- dataset[(dataset['pct_emp'] <= 10000) & (dataset['pct_wage'] <= 10000),]

dataset$pct_emp_diff = dataset$pct_emp - 100*dataset$cntynaics_emp
dataset$pct_wage_diff = dataset$pct_wage - 100*dataset$cntynaics_wage

dataset <- dataset[(dataset['pct_emp_diff'] <= 10000) & (dataset['pct_emp_diff'] >= -1000) & (dataset['pct_wage_diff'] <= 10000) & (dataset['pct_wage_diff'] >= -1000),]




dataset$yr_growth <- ifelse(is.na(dataset$yr_growth),0,dataset$yr_growth)
#dataset$pct_emp <- 100 * dataset$emp_m_fill / dataset$aaemp
#dataset$pct_emp_missing <- 100 * dataset$emp_m / dataset$aaemp
#dataset$Dmulti <- ifelse(dataset$ein_aaemp>dataset$aaemp, 1, 0)
dataset$Ddecline <- ifelse(dataset$yr_growth < 0, 1, 0)
dataset$Dgrowth <- ifelse(dataset$yr_growth > 0, 1, 0)





rhs1 = c('Deligible',
         'Ddecline',
         'Dgrowth',
         #'Dmulti',
         'yr_growth',
         'Dfranchise',
         # 'bank_dist_bins_0',
         # 'bank_dist_bins_1',
         # 'bank_dist_bins_2',
         #'bank_dist_bins_3',
#'janmar_emp_ratio', 'aprjun_emp_ratio', 'julsep_emp_ratio', 'octdec_emp_ratio',
#        'janmar_wages_ratio', 'aprjun_wages_ratio', 'julsep_wages_ratio', 'octdec_wages_ratio',
'jan_closed',
 'feb_closed',
 'mar_closed',
 'apr_closed',
 'may_closed',
 'jun_closed',
 'jul_closed',
 'aug_closed',
 'sep_closed',
 'oct_closed',
 'nov_closed',
 'dec_closed',
        'D_eidl_grant', 'D_eidl_loan', 'Dppp2021'
        )



#dataset$fipscnty<- relevel(as.factor(dataset$fipscnty), ref = "35001")
dataset$size_class_multi<- relevel(as.factor(dataset$size_class_multi), ref = "m100")
dataset$ein_size_class<- relevel(as.factor(dataset$ein_size_class), ref = "5000")
dataset$avg_wages_bin<- relevel(as.factor(dataset$avg_wages_bin), ref = "1")
dataset$age_bins<- relevel(as.factor(dataset$age_bins), ref = "21")
dataset$naics2<- relevel(as.factor(dataset$naics2), ref = "81")
dataset$urban_classification<- relevel(as.factor(dataset$urban_classification), ref = "7")
#'fipscnty',
catvals <- c('naics2', 'age_bins','avg_wages_bin', 'urban_classification',
         'ein_size_class', 'size_class_multi')
rhs <- c(rhs1,catvals )





form <- as.formula(paste("~ ",paste(rhs,collapse = "+"), sep=""))



form_basic <- as.formula('~1')

#########separate regressions
#,'Dppp_employment_verify'
depvars <- c('pct_emp',  'pct_wage', 'Dclosed')



library(dplyr)
library("clubSandwich")
#https://cran.r-project.org/web/packages/clubSandwich/vignettes/panel-data-CRVE.html
library("broom")
library("dplyr")

############overall
  tmean <- colMeans(subset(dataset,select=rhs1))
  rhs2a <- names(which( tmean > .002))
  tmean <- colMeans(subset(dataset,select=rhs2a))
  tnum <-
  # rhs2 <- names(which(colMeans(subset(dataset[(dataset[[keyvariable]] == i),],select=rhs2)) < .99 & ))
  rhs2 <- list()
  for(j in 1:length(rhs2a)) {
      tvar <- rhs2a[j]
      if (((tmean[j] < .999) & (n_distinct(dataset[tvar] <= 2))) | ((tmean[j] >= .999) & (n_distinct(dataset[tvar] > 2)))){
  rhs2[length(rhs2) + 1] <- tvar    # Append new list element
}
}

rhs2 <- unlist(rhs2)
  form <- as.formula(paste("~ ",paste(rhs2,collapse = "+"), "+", paste(catvals,collapse = "+"), sep=""))




  for (depvar in depvars){
    ddepvar = paste0('D',depvar)
        print(depvar)
    print(nrow(dataset[(dataset[[ddepvar]] == 1),]))
    dataset$weight <- dataset$aaemp_19 / max(dataset[(dataset[[ddepvar]] == 1),]$aaemp_19)
    if (depvar != 'Dclosed') {
  depvar = paste0(depvar,'_diff')
}

  out1f <- att_gt(yname = depvar,
                  gname = "ppp_ein_month",
                  idname = "ldb_num",
                  tname = "num_month",
                  xformla = form,
                  data = dataset[(dataset[[ddepvar]] == 1),],
                  est_method = "dr",
                  bstrap = TRUE,
                  biters = 1000,
                  #print_details = TRUE,
                  clustervars = c("ldb_num", 'fipscnty'),
                  panel = TRUE,
                  control_group = "notyettreated",
                  anticipation = 1,
                  pl = TRUE,
                  cores = 2,
                  #weightsname = "weight"
  )

  print(Sys.time())
  print(paste("RESULTS BELOW: preferred results - ",depvar," - control variables - EIN month - ",'overall', sep=': '))
  print(form)
  summary(out1f)

    resultsname = paste(filename, depvar,  'main', sep='_')
    saveRDS(out1f,paste0(resultsloc, resultsname))

  es <- aggte(out1f, type = "dynamic", na.rm=TRUE, min_e = -5, max_e = 15)
  summary(es)
  es <- aggte(out1f, type = "group", na.rm=TRUE)
  summary(es)
      es <- aggte(out1f, type = "calendar", na.rm=TRUE)
  summary(es)


    print(Sys.time())
  print(paste("RESULTS ABOVE: preferred results - ",depvar," - control variables - EIN month - ",'overall', sep=': '))
}


#######i only want basic to be without difference in dep var

  for (depvar in depvars){
    ddepvar = paste0('D',depvar)
    #dataset$weight <- (dataset$aaemp_19 - mean(dataset[(dataset[[ddepvar]] == 1),]$aaemp_19)) /sd(dataset[(dataset[[ddepvar]] == 1),]$aaemp_19
    #depvar = paste0(depvar,'_diff')
  out1f <- att_gt(yname = depvar,
                  gname = "ppp_ein_month",
                  idname = "ldb_num",
                  tname = "num_month",
                  xformla = form_basic,
                  data = dataset[(dataset[[ddepvar]] == 1),],
                  est_method = "dr",
                  bstrap = TRUE,
                  biters = 1000,
                  #print_details = TRUE,
                  clustervars = c("ldb_num", 'fipscnty'),
                  panel = TRUE,
                  control_group = "notyettreated",
                  anticipation = 1,
                  pl = TRUE,
                  cores = 2,
  )

  print(Sys.time())
  print(paste("RESULTS below: preferred results - ",depvar," - control variables - EIN month - ",'overall', sep=': '))
      print('')
  print(form_basic)
  summary(out1f)

    resultsname = paste(depvar,  'main', sep='_')
    #saveRDS(out1f,paste0(resultsloc, resultsname))

  es <- aggte(out1f, type = "dynamic", na.rm=TRUE, min_e = -5, max_e = 15)
  summary(es)
  es <- aggte(out1f, type = "group", na.rm=TRUE)
  summary(es)
      es <- aggte(out1f, type = "calendar", na.rm=TRUE)
  summary(es)
  print(paste("RESULTS ABOVE: preferred results - ",depvar," - control variables - EIN month - ",'overall', sep=': '))
}


